Beijing, China - May 24-26, 2016

Outline

  • Session 1: Motivation, why and how to think about data, and getting started with R
  • Session 2: Making basic plots, grammar of graphics, good practices
  • Session 3: Advanced graphics, layering, using maps

(If you re-started RStudio, be sure to re-open your project too.)

Building charts piecewise

Back to Education - How to Make the Plots

Read the OECD PISA data

student2012.sub <- readRDS("../data/student_sub.rds")
dim(student2012.sub)
#> [1] 271323     50
student2012.sub$ST04Q01 <- factor(student2012.sub$ST04Q01, 
  levels=c(1,2), labels=c("Female", "Male"))

Calculate the statistics: Mean difference

student2012.stats <- student2012.sub %>% 
  group_by(CNT) %>%
  summarise(wmathgap=weighted.mean(PV1MATH[ST04Q01=="Male"], 
                  w=SENWGT_STU[ST04Q01=="Male"], na.rm=T)-
               weighted.mean(PV1MATH[ST04Q01=="Female"],
                  w=SENWGT_STU[ST04Q01=="Female"], na.rm=T))
kable(head(student2012.stats))
CNT wmathgap
ARE -5.1
AUS 12.3
AUT 21.9
BEL 11.4
BGR -2.1
BRA 21.1

Map

  • Means are point estimates
  • Map mean difference to point geom, using position along a line

Plot - Need to Order!

ggplot(data=student2012.stats) + 
  geom_point(aes(x=CNT, y=wmathgap), size=3) + 
 coord_flip() + theme_bw()

Arrange

  • Arrange: by country
  • Order by magnitude of difference

student2012.stats$CNT <- factor(student2012.stats$CNT, 
    levels=student2012.stats$CNT[order(student2012.stats$wmathgap)])
ggplot(data=student2012.stats) + 
  geom_point(aes(x=CNT, y=wmathgap), size=3) + 
 coord_flip() + theme_bw()

Inference

Create bootstrap confidence intervals for each mean difference

cifn <- function(d, i) {
  x <- d[i,]
  ci <- weighted.mean(x$PV1MATH[x$ST04Q01=="Male"], 
          w=x$SENWGT_STU[x$ST04Q01=="Male"], na.rm=T)-
        weighted.mean(x$PV1MATH[x$ST04Q01=="Female"],
          w=x$SENWGT_STU[x$ST04Q01=="Female"], na.rm=T)
  ci
}

bootfn <- function(d) {
  r <- boot(d, statistic=cifn, R=100)
  l <- sort(r$t)[5]
  u <- sort(r$t)[95]
  ci <- c(l, u)
  return(ci)
}

Apply ci functions to data

student2012.sub.summary.gap.boot <- student2012.sub %>% 
  split(.$CNT) %>% purrr::map(bootfn) %>% data.frame() %>%
  gather(CNT, value)
student2012.sub.summary.gap.boot$ci <- 
  rep(c("ml","mu"), 
      length(unique(student2012.sub.summary.gap.boot$CNT)))
student2012.sub.summary.gap.boot.wide <- 
  student2012.sub.summary.gap.boot %>% 
  spread(ci, value)
student2012.sub.summary.gap <- merge(student2012.stats,
  student2012.sub.summary.gap.boot.wide)
kable(head(student2012.sub.summary.gap))
CNT wmathgap ml mu
ARE -5.1 -9.0 -1.8
AUS 12.3 8.8 15.0
AUT 21.9 17.8 25.8
BEL 11.4 7.9 15.5
BGR -2.1 -6.8 2.3
BRA 21.1 16.3 24.6

Plot

ggplot(data=student2012.sub.summary.gap) + 
  geom_point(aes(x=CNT, y=wmathgap), size=3) + 
  geom_segment(aes(x=CNT, xend=CNT, y=ml, yend=mu)) + 
  coord_flip() + theme_bw() 

Enhance

Match three digit codes to country names, more recognizable labels

student2012.sub.summary.gap$name <- NA
for (i in 1:length(student2012.sub.summary.gap$name))  
  student2012.sub.summary.gap$name[i] <-
  isoToName(as.character(student2012.sub.summary.gap$CNT[i]))
# QCN is Shanghai, not whole of China - 
# Don't know what country TAP is
student2012.sub.summary.gap$name[
  student2012.sub.summary.gap$CNT == "QCN"] <- 
  isoToName("CHN")
student2012.sub.summary.gap$name[
  student2012.sub.summary.gap$CNT == "TAP"] <- 
  "TAP"

Create categorical gap variable to draw attention to significant difference

student2012.sub.summary.gap$wmathgap_cat <- "same"
student2012.sub.summary.gap$wmathgap_cat[
  student2012.sub.summary.gap$ml > 0] <- "boys"
student2012.sub.summary.gap$wmathgap_cat[
  student2012.sub.summary.gap$mu < 0] <- "girls"
kable(head(student2012.sub.summary.gap))
CNT wmathgap ml mu name wmathgap_cat
ARE -5.1 -9.0 -1.8 United Arab Emirates girls
AUS 12.3 8.8 15.0 Australia boys
AUT 21.9 17.8 25.8 Austria boys
BEL 11.4 7.9 15.5 Belgium boys
BGR -2.1 -6.8 2.3 Bulgaria same
BRA 21.1 16.3 24.6 Brazil boys

Order Again

student2012.sub.summary.gap$name <- factor(student2012.sub.summary.gap$name, 
    levels=student2012.sub.summary.gap$name[
      order(student2012.sub.summary.gap$wmathgap)])
kable(head(student2012.sub.summary.gap))
CNT wmathgap ml mu name wmathgap_cat
ARE -5.1 -9.0 -1.8 United Arab Emirates girls
AUS 12.3 8.8 15.0 Australia boys
AUT 21.9 17.8 25.8 Austria boys
BEL 11.4 7.9 15.5 Belgium boys
BGR -2.1 -6.8 2.3 Bulgaria same
BRA 21.1 16.3 24.6 Brazil boys

Plot - with Guide Lines

ggplot(data=student2012.sub.summary.gap) + 
  geom_hline(yintercept=0, colour="grey80") + 
  geom_point(aes(x=name, y=wmathgap, color=wmathgap_cat), 
             size=3) + 
  geom_segment(aes(x=name, xend=name, y=ml, yend=mu, 
                   color=wmathgap_cat)) + 
  coord_flip() + theme_bw() 

Enhance More

  • Labels
  • Axis limits
  • Grid lines
  • Color

ggplot(data=student2012.sub.summary.gap) + 
  geom_hline(yintercept=0, colour="grey80") + 
  geom_point(aes(x=name, y=wmathgap, color=wmathgap_cat), size=3) + 
  geom_segment(aes(x=name, xend=name, y=ml, yend=mu, 
     color=wmathgap_cat)) + xlab("") +  
  scale_colour_manual("", values=c("boys"="skyblue", 
    "girls"="pink", "same"="lightgreen")) +
  scale_y_continuous("Girls <----------> Boys", 
    breaks=seq(-30, 30, 10), limits=c(-35, 35), 
    labels=c(seq(30, 0, -10), seq(10, 30, 10))) + 
  coord_flip() + theme_bw() + 
  theme(axis.text.x = element_text(size=5), 
        axis.text.y = element_text(size=5), 
        axis.title = element_text(size=7), 
        legend.text = element_text(size=5),
        legend.title = element_text(size=5))

Interactive

ggplotly()

Maps

Map data is essentially a set of points, and line segments. You can get maps from various sources, and wrangle the files/data into an R object. This can be merged with data to provide spatial context to problems.

world <- getMap(resolution = "low")
extractPolys <- function(p) {
  polys <- NULL
  for (i in 1:length(p)) {
    for (j in 1:length(p[[i]]@Polygons)) {
      x <- p[[i]]@Polygons[[j]]@coords
      polys$lon <- c(polys$lon, x[,1])
      polys$lat <- c(polys$lat, x[,2])
      polys$ID <- c(polys$ID, rep(p[[i]]@ID, nrow(x)))
      polys$region <- c(polys$region, 
        rep(paste(p[[i]]@ID, j, sep="_"), nrow(x)))
      polys$order <- c(polys$order, 1:nrow(x))
    }
  }
  return(data.frame(polys))
}
polys <- extractPolys(world@polygons)

Here is what is looks like:

kable(head(polys))
lon lat ID region order
-70 12 Aruba Aruba_1 1
-70 12 Aruba Aruba_1 2
-70 12 Aruba Aruba_1 3
-70 12 Aruba Aruba_1 4
-70 13 Aruba Aruba_1 5
-70 13 Aruba Aruba_1 6

A map is a set of points…

ggplot(data=filter(polys, region=="China_1"), 
       aes(x=lon, y=lat)) + 
  geom_point()

connected in the right order

ggplot(data=filter(polys, region=="China_1"), 
       aes(x=lon, y=lat, order=order)) + 
  geom_path()

as a group

ggplot(data=filter(polys, region=="China_1" | region=="Australia_1"), 
       aes(x=lon, y=lat, group=region, order=order)) + 
  geom_path()

Plot all

ggplot(data=polys) + 
  geom_path(aes(x=lon, y=lat, group=region, order=order))

Join education data with map polygons

polys <- polys %>% rename(name = ID)
student2012.sub.map <- left_join(
  student2012.sub.summary.gap, polys)
student2012.sub.map <- student2012.sub.map %>% 
  arrange(region, order)

Map theme

Make it look like a map, by tweaking the plot appearance

theme_map <- theme_bw()
theme_map$line <- element_blank()
theme_map$strip.text <- element_blank()
theme_map$axis.text <- element_blank()
theme_map$plot.title <- element_blank()
theme_map$axis.title <- element_blank()
theme_map$panel.border <- element_rect(
  colour = "grey90", size=1, fill=NA)

Plot - axes, colors, coord system

ggplot(data=polys) + 
  geom_path(aes(x=lon, y=lat, group=region, order=order), 
            colour=I("grey90"), size=0.1) + 
  geom_polygon(data=student2012.sub.map, aes(x=lon, y=lat, 
            group=region, order=order,  
            fill=wmathgap_cat)) +
  scale_fill_manual("Diff>5", values=c("boys"="skyblue", 
                                    "girls"="pink", 
                                    "same"="lightgreen")) + 
  scale_x_continuous(expand=c(0,0)) + 
  scale_y_continuous(expand=c(0,0)) +
  coord_equal() + theme_map 

Interactive

ggplotly()

Raster maps

Image maps can also be pulled using the package ggmap to use as the background for data.

gm <- get_googlemap(center = c(lon=144.96, lat=-37.815), zoom=14)
ggmap(gm) 

sensor_loc <- read_csv("../data/Pedestrian_Sensor_Locations.csv")
ggmap(gm) + geom_point(data=sensor_loc, aes(x=Longitude, y=Latitude))

Your Turn

  • With your neighbor, take one of the problems that you brainstormed on the first day, and work a question through from start to end, summarise.
  • Choices were: economics, gapminder, pedestrian sensor.

Temporal data - Melbourne temperature

melbtemp <- read.fwf("../data/ASN00086282.dly", 
   c(11, 4, 2, 4, rep(c(5, 1, 1, 1), 31)), fill=T)
melbtemp.m <- melbtemp %>%
  select(num_range("V", c(1,2,3,4,seq(5,128,4)))) %>%
  filter(V4 %in% c("PRCP", "TMAX", "TMIN")) %>%
  gather(day, value, V5:V125, na.rm = TRUE) %>%
  spread(V4, value) %>%
  mutate(
    tmin = as.numeric(TMIN) / 10,
    tmax = as.numeric(TMAX) / 10,
    t_range = tmax - tmin,
    prcp = as.numeric(PRCP) / 10
  ) %>%
  rename(stn=V1, year=V2, month=V3)

melbtemp.m$day <- factor(melbtemp.m$day, 
  levels=c("V5","V9","V13","V17","V21","V25","V29",
           "V33","V37","V41","V45","V49","V53","V57",
           "V61","V65","V69","V73","V77","V81","V85",
           "V89","V93","V97","V101","V105","V109",
           "V113","V117","V121","V125"),
  labels=1:31)
melbtemp.m$date <- as.Date(paste(melbtemp.m$day, 
     melbtemp.m$month, melbtemp.m$year, sep="-"),
     "%d-%m-%Y")

What are the temperature patterns in Melbourne over time? Daily resolution: global and seasonal patterns

kable(head(melbtemp.m))
stn year month day PRCP TMAX TMIN tmin tmax t_range prcp date
ASN00086282 1970 7 25 0 158 78 7.8 16 8.0 0.0 1970-07-25
ASN00086282 1970 7 26 13 149 36 3.6 15 11.3 1.3 1970-07-26
ASN00086282 1970 7 27 3 133 61 6.1 13 7.2 0.3 1970-07-27
ASN00086282 1970 7 28 0 143 46 4.6 14 9.7 0.0 1970-07-28
ASN00086282 1970 7 29 25 150 42 4.2 15 10.8 2.5 1970-07-29
ASN00086282 1970 7 30 0 145 63 6.3 14 8.2 0.0 1970-07-30

Oops, didn't handle missings!

ggplot(data=melbtemp.m, aes(x=date, y=tmax)) + geom_point(size=0.1)

Replace -9999 as NAs

melbtemp.m <- melbtemp %>%
  select(num_range("V", c(1,2,3,4,seq(5,128,4)))) %>%
  filter(V4 %in% c("PRCP", "TMAX", "TMIN")) %>%
  gather(day, value, V5:V125, na.rm = TRUE) %>%
  mutate(value=ifelse(value<(-9000), NA, value)) %>%
  spread(V4, value) %>%
  mutate(
    tmin = as.numeric(TMIN) / 10,
    tmax = as.numeric(TMAX) / 10,
    t_range = tmax - tmin,
    prcp = as.numeric(PRCP) / 10
  ) %>%
  rename(stn=V1, year=V2, month=V3)

Plot

ggplot(data=melbtemp.m, aes(x=date, y=tmax)) + geom_point(size=0.1)

Aspect ratio

Aspect ratio emphasizes global trend

ggplot(data=melbtemp.m, aes(x=date, y=tmax)) + 
  geom_point(size=0.5, alpha=0.4) +
  theme_bw() + theme(aspect.ratio=2)

Aspect ratio

Aspect ratio emphasizes seasonality

ggplot(data=melbtemp.m, aes(x=date, y=tmax)) + 
  geom_point(size=0.1) +
  theme(aspect.ratio=0.1)

Seasonal

ggplot(data=melbtemp.m, aes(x=factor(month), y=tmax)) + 
  geom_boxplot() +
  theme(aspect.ratio=0.3)

Cooler in June-July. Variability is higher in the summer.

Global trend aggregate

Long time series make it difficult to examine trend. Aggregate on year, and use the min and max values.

melbtemp.m.r <- melbtemp.m %>% 
  group_by(year) %>%
  summarise(tmin=min(tmin, na.rm=TRUE), 
            tmax=max(tmax, na.rm=TRUE)) 

Global Trend

ggplot(data=melbtemp.m.r, aes(x=year, ymin=tmin, ymax=tmax)) + 
  geom_ribbon(alpha=0.5) + 
  geom_smooth(aes(x=year, y=tmax), se=F, method="lm", colour="black") +
  geom_smooth(aes(x=year, y=tmin), se=F, method="lm", colour="black") +
  theme(aspect.ratio=1) + xlab("") + ylab("Temperature")

Most years similar

ggplot(data=filter(melbtemp.m, year > 1990, year < 2011), 
       aes(x=factor(month), y=tmax)) + 
  geom_boxplot() + facet_wrap(~year) +
  theme(aspect.ratio=0.3)

Your Turn

Examine global and seasonal patterns of precipitation in Melbourne

Multiple Plots on a Sheet

Occasionally you would like to organize your plots in special ways. The gridExtra can be used to take individual plots and lay them out together.

p1 <- ggplot(data=subset(CO2.all, flg < 2), 
             aes(x=date, y=co2, colour=stn)) +
  geom_line() + xlab("Year") + ylab("CO2 (ppm)") + 
        facet_wrap(~stn, ncol=1) + 
  theme(axis.text.y=element_text(size = 6), legend.position="none")
p2 <- ggplot(data=subset(CO2.all, flg < 2), 
             aes(date, co2, colour=stn)) +
  geom_line() + xlab("Year") + ylab("CO2 (ppm)") + 
  theme(axis.text.y=element_text(size = 6), legend.position="none")
p3 <- ggplot(data=polys) + 
  geom_path(aes(x=lon, y=lat, group=region, order=order), 
            colour=I("grey60"), size=0.1) + 
  geom_point(data=CO2.all.loc, aes(x=lon, y=lat, group=1), colour="red", 
                      size=2, alpha=0) +
  geom_text(data=CO2.all.loc, aes(x=lon, y=lat, label=stn, group=1), 
            colour="orange", size=5) +
  coord_equal() + theme_map 

grid.arrange(p1, p2, p3, layout_matrix = rbind(c(1,2),c(3,3)))

Credit